library(ngsReports)
library(edgeR)
library(magrittr)
library(scales)
library(tidyverse)
library(ggrepel)
library(pander)
theme_set(theme_bw())
rawFqc <- list.files("../0_rawData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
trimFqc <- list.files("../1_trimmedData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
alnFqc <- list.files("../2_alignedData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
rt <- list(
readTotals(rawFqc) %>% mutate(Stage = "Raw"),
readTotals(trimFqc) %>% mutate(Stage = "Trimmed"),
readTotals(alnFqc) %>% mutate(Stage = "Aligned")
) %>%
bind_rows() %>%
mutate(Sample = str_remove_all(Filename, "(_R1.fq.gz|Aligned.sortedByCoord.out.bam)"),
Sample = factor(Sample, levels = str_sort(unique(Sample), numeric = TRUE)),
Stage = factor(Stage, levels = c("Raw", "Trimmed", "Aligned")))
Summary of library sizes after each step. The slight increase after alignments indicates a low level of multiple alignments
No unusual behaviour was found in GC content.
GC content for raw data
GC content for aligned data
No issues were evident with regard to duplication levels
Duplication levels for raw data
Duplication levels for aligned data
Alignment rates were also very good
alnLogs <- list.files("../2_alignedData/logs/", full.names = TRUE, pattern = "final.out") %>%
importStarLogs()
Alignment summaries for all libraries (before merging replicates)
Gene-level counts were loaded without filtering and an MDS plot generated in order to check whether replicate libraries grouped correctly.
counts <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
set_names(basename(colnames(.))) %>%
set_names(str_remove_all(colnames(.), "Aligned.sortedByCoord.out.bam"))
dge <- counts %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
dge$samples %<>%
rownames_to_column("library") %>%
mutate(Genotype = str_extract_all(library, "(FAD|FS|__)"),
Genotype = str_replace_all(Genotype, "__", "WT"),
Genotype = factor(Genotype, levels = c("WT", "FS", "FAD")),
Sample = str_replace(library, "__-", "_WT"),
Sample = ifelse(str_count(Sample, "_") == 3, Sample, paste0(Sample, "_1")),
Replicate = str_extract(Sample, "[12]$"),
Sample = str_remove(Sample, "_[12]$"),
group = as.integer(Genotype)) %>%
column_to_rownames("library")
mds <- plotMDS(dge, plot = FALSE) %>%
extract2("cmdscale.out") %>%
set_colnames(paste0("Dim", 1:2)) %>%
as.data.frame() %>%
rownames_to_column("Library") %>%
as_tibble()
Plot of replicate libraries showing good concordance between replicates. These will be combined and low-expressed genes removed for the DE analysis
It was noted that library sizes were about 10-20 fold smaller than expected after summarising to gene level counts and these ranged between 497,360 and 795,879. Alignments were then inspected manually in order to ascertain the cause of this low counting rate.